
###################################
###### 02_create_fulljury_data.R ##
###################################

# SETUP ####
# prevent scientific notation
options(scipen=10)
# load packages
library(pacman)
p_load(here, dplyr, ggplot2, stringr, vroom, readxl, antiword, purrr, 
       googlesheets4, magrittr, tidyverse, bit64, gridExtra, corrplot, fastDummies, estimatr)
pacman::p_load_gh("trinker/textreadr")
i_am("Code/02_create_fulljury_data.R")
# load word-to-number functions
source(here("Code", "word_to_number.R"))

# create jury_level.csv dataset ####

# identify and clean transcript names:
# V1 = transcript file name, V2 = numeric transcript id derived from it.
# The pattern is anchored and the dot escaped so only names that really end
# in ".docx" are picked up, and the columns are named explicitly rather than
# through an auto-generated name that embeds the deparsed list.files() call.
files <- data.frame(
  V1 = list.files(here("Jury transcripts"), pattern = "\\.docx$"),
  stringsAsFactors = FALSE
)
# Drop the first "corr2"/"Corr2" tag (so its digit does not leak into the id),
# then strip every letter and hyphen, then spaces and dots.
files$V2 <- files$V1 %>%
  str_remove("corr2|Corr2") %>%
  str_remove_all("[A-Za-z-]") %>%
  str_remove_all(" |\\.")

# read in transcripts
# Each .docx is read with read_docx() and its elements are pasted into one
# string per transcript. The frame is built in a single step instead of
# rbind()-ing inside a loop, which also handles an empty transcript folder
# and no longer carries the all-NA placeholder row the loop started from
# (that row held no transcript text).
juries <- data.frame(
  transcript_id_numbers = files$V2,
  text = vapply(
    as.character(files$V1),
    function(f) paste0(read_docx(here("Jury transcripts", f)), collapse = " "),
    character(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)

# load in jury data

dat <- read.csv(here("Data", "JuryDataSummer2004.csv"))
# Recode each juror's dollar preference (idoll) to a 0-8 ordinal scale.
# The %in% bins only match whole-dollar amounts; anything that matches no
# condition (including missing idoll) becomes NA.
dat <- mutate(
  dat,
  or_idoll = case_when(
    idoll == 0 ~ 0,
    idoll %in% (1:50000) ~ 1,
    idoll %in% (50001:100000) ~ 2,
    idoll %in% (100001:200000) ~ 3,
    # `:` binds tighter than `-`, so this is (200001:500000) - 1, i.e.
    # 200000..499999; 200000 itself is already claimed by the bin above
    idoll %in% (200001:500000 - 1) ~ 4,
    idoll %in% (500000:750000) ~ 5,
    idoll %in% (750001:1000000) ~ 6,
    idoll %in% (1000001:3000000) ~ 7,
    idoll > 3000000 ~ 8,
    TRUE ~ NA_real_
  )
)
# calculate jury-level demographic composition
# mutate() (not summarise()) is used, so every juror row keeps a copy of its
# jury's totals; the *_ind / *_prefs columns are space-separated strings of
# the individual jurors' values.
dat <- dat %>%
  group_by(jurynum) %>%
  mutate(
    n_js = n(),
    whites = sum(white),
    white_ind = paste0(white, collapse = " "),
    nonwhites = sum(white == 0),
    women = sum(sex == 2),
    age_young = sum(age == 1),
    age_med = sum(age %in% c(2, 3, 4)),
    age_old = sum(age %in% c(5, 6)),
    ed_low = sum(educ %in% c(1, 2)),
    ed_med = sum(educ == 3),
    ed_hi = sum(educ %in% c(4, 5, 6)),
    inc_low = sum(income %in% c(1, 2, 3), na.rm = TRUE),
    inc_med = sum(income == 4, na.rm = TRUE),
    inc_hi = sum(income %in% c(5, 6), na.rm = TRUE),
    inc_miss = sum(is.na(income)),
    doll_prefs = paste0(idoll, collapse = " "),
    scale_prefs = paste0(iscale, collapse = " ")
  )
# keep only juries with six members
# (the filter uses num_js, a column already present in the input data, not
# the n_js count computed above)
dat <- filter(dat, num_js == 6)

# record scale of deliberations
# Under order "A" round 1 uses the ordinal dollar preference and round 2 the
# scale preference; under order "B" the two are swapped.
dat <- dat %>%
  ungroup() %>%
  mutate(
    r2 = case_when(
      order == "A" ~ iscale,
      order == "B" ~ as.integer(or_idoll)
    ),
    r1 = case_when(
      order == "A" ~ as.integer(or_idoll),
      order == "B" ~ iscale
    )
  )

# record pre-deliberation sd (within-jury sd of round-1 preferences)
dat <- dat %>%
  group_by(jurynum) %>%
  mutate(sd_pre = sd(r1, na.rm = TRUE))

# select relevant variables
# Only jury-constant columns are kept, so distinct() collapses the juror rows
# of a jury wherever those columns agree.
dat <- dat %>%
  select(
    jurynum, whites:scale_prefs, sd_pre,
    jrydoll, jryscale, hj1, hj2, order, scenario
  ) %>%
  distinct()

# recode jury dollar verdicts to ordinal scale (0-8; unmatched values -> NA)
dat <- mutate(
  dat,
  jury.eight = case_when(
    jrydoll == 0 ~ 0,
    jrydoll >= 1 & jrydoll <= 50000 ~ 1,
    jrydoll >= 50001 & jrydoll <= 100000 ~ 2,
    jrydoll >= 100001 & jrydoll <= 200000 ~ 3,
    jrydoll >= 200001 & jrydoll < 500000 ~ 4,
    jrydoll >= 500000 & jrydoll <= 750000 ~ 5,
    jrydoll >= 750001 & jrydoll <= 1000000 ~ 6,
    jrydoll >= 1000001 & jrydoll <= 3000000 ~ 7,
    jrydoll > 3000000 ~ 8,
    TRUE ~ NA_real_
  )
)
# recode verdicts by scale: same A/B round mapping as for the preferences
dat <- mutate(
  dat,
  r2_decision = case_when(
    order == "A" ~ jryscale,
    order == "B" ~ jury.eight
  ),
  r1_decision = case_when(
    order == "A" ~ jury.eight,
    order == "B" ~ jryscale
  )
)
# exclude obs. not with uniform order
dat <- filter(dat, order %in% c("A", "B"))

# read in RA-recorded juror information
jurorspeech <- read.csv(here("Data", "jurorspeech.csv"))

# clean transcript names for merge: strip all letters and colons, then all
# spaces, leaving the numeric part of the transcript id
jurorspeech <- jurorspeech %>%
  mutate(
    transcript_id_numbers = str_remove_all(transcript_id, "[qwertyuioplkjhgfdsazxcvbnmQWERTYUIOPLKJHGFDSAZXCVBNM:]"),
    transcript_id_numbers = str_remove_all(transcript_id_numbers, " ")
  ) %>%
  select(jurynum, transcript_id_numbers, transcript_id)

# merge transcript linkage data (natural join on the shared columns), then
# drop the duplicate rows the join produces
dat <- left_join(dat, jurorspeech) %>%
  distinct()



# read in tracking sheet to get non-codable transcripts
tapes <- read_xlsx(here("Data", "transcript_coding.xlsx"))
# recode transcript names for merge: drop the ".mp3" suffix and the first
# [bracketed] tag, blank out punctuation, then remove letters and spaces
tapes$Name <- tapes$Name %>%
  str_remove(".mp3") %>%
  str_remove("\\[[^\\]]*\\]") %>%
  str_replace_all("[[:punct:]]", " ") %>%
  str_replace_all("[a-zA-Z]", "") %>%
  str_remove_all(" ")

# recode variables to record whether identifying was attempted:
# attempt = 1 if the tape is flagged as not codable, has a placeholder
# initial ID, or has a full-transcription entry; 0 otherwise
tapes <- tapes %>%
  rename(
    "cant_code" = `Not coding for now`,
    "transcript_id_numbers" = Name
  ) %>%
  mutate(
    attempt = case_when(
      cant_code == "X" ~ 1,
      `Initial ID` %in% c("N/A", "NA", "x", "X") ~ 1,
      !is.na(`Full Transcription`) ~ 1,
      TRUE ~ 0
    )
  )
# merge with full data (full join keeps tapes that match no jury and vice versa)
dat <- full_join(
  dat,
  select(tapes, transcript_id_numbers, cant_code, attempt, Notes)
)

# load juror speech
ra_dat <- read.csv(here("Data", "jurorspeechtrial.csv"))
ra_dat <- ra_dat[!is.na(ra_dat$case_id), ]
# only complete juries
ra_dat <- ra_dat %>% filter(num_js == 6)

# recode un-identified jurors to blank Identifier variable
# same for duplicate labels within jury
# duplabs holds one row per (jury, label) pair that occurs more than once
duplabs <- ra_dat %>%
  filter(!is.na(Identifier)) %>%
  count(jurynum.x, Identifier, name = "n_dup") %>%
  filter(n_dup > 1) %>%
  select(jurynum.x, Identifier)
# seq_len() makes the loop a no-op when there are no duplicated labels;
# 1:nrow() would iterate over c(1, 0) and index 0 yields a zero-length
# condition inside case_when()
for (i in seq_len(nrow(duplabs))) {
  ra_dat %<>%
    mutate(
      Identifier = case_when(
        jurynum.x == duplabs$jurynum.x[i] &
          Identifier == duplabs$Identifier[i] ~ NA_character_,
        TRUE ~ Identifier
      )
    )
}

# record, for each jury, the number of people who could not be identified
# (no_notes is computed but not kept by the select() below)
ra_dat <- ra_dat %>%
  group_by(jurynum.x) %>%
  mutate(no_notes = sum(is.na(Notes))) %>%
  mutate(no_ids = sum(is.na(Identifier)))
ra_dat <- ra_dat %>%
  ungroup() %>%
  select(transcript_id_numbers, no_ids)
ra_dat %<>% distinct()

# merge with full data
dat <- full_join(dat, ra_dat)

# count transcripts checked and found unmatched as attempted, then keep the
# first row for each jurynum
dat <- dat %>%
  mutate(
    attempt = case_when(
      transcript_id == "Unable to Match/Identify Transcript" ~ 1,
      TRUE ~ attempt
    )
  ) %>%
  distinct(jurynum, .keep_all = TRUE)

# export data
write.csv(dat, here("Data", "jury_level.csv"))


# process numbers ###################

# attach the jury-level variables to each transcript; from here on `dat` has
# one row per transcript text
juries <- left_join(juries, dat)

dat <- juries
rm(files, jurorspeech, ra_dat, duplabs, tapes)



# first, turn scale point names into numbers
# str_replace_all() is used so that every mention in a transcript is
# converted; str_replace() would only convert the first occurrence and leave
# all later scale-point names as words.
# "extremely severe" must be handled before the bare "severe".
dat$text <- str_replace_all(dat$text, "none|None", "0")
dat$text <- str_replace_all(dat$text, "mild|Mild", "2 ")
dat$text <- str_replace_all(dat$text, "substantial|Substantial", "4 ")
dat$text <- str_replace_all(dat$text, "extremely severe|Extremely severe", "8 ")
dat$text <- str_replace_all(dat$text, "severe|Severe", "6 ")


# separate deliberation by round: everything before the first "Envelope #3"
# is round 1 (text1), everything after it is round 2 (text2)
dat <- dat %>%
  separate(
    text,
    sep = "Envelope #3",
    into = c("text1", "text2"),
    remove = FALSE,
    extra = "merge"
  )
# drop everything up to and including the first "Envelope #1" marker
dat$text1 <- gsub("^.*?Envelope #1", "", dat$text1)
dat$text2 <- gsub("Envelope #4", "", dat$text2)

# remove observations missing text
dat <- dat %>% filter(!is.na(text))

# round 2: replace words with numbers when expressing preferences
dat$nums2 <- lapply(dat$text2, find_and_replace)
# note locations of preferences, converted to integer positions
dat$nums2_loc <- lapply(
  lapply(dat$text2, function(txt) find_and_replace(txt, ret = "loc")),
  as.integer
)

# repeat for round 1
# first remove this number, given as an example of an impossibly large
# amount, because it doesn't parse
dat$text1 <- str_remove(dat$text1, "nine hundred thousand million")
# remove special character causing problems
dat$text1 <- str_remove_all(dat$text1, "◄")

# replace words with numbers, and note their integer positions
dat$nums1 <- lapply(dat$text1, find_and_replace)
dat$nums1_loc <- lapply(
  lapply(dat$text1, function(txt) find_and_replace(txt, ret = "loc")),
  as.integer
)

# remove obs missing order
dat <- filter(dat, !is.na(order))

# remove numbers outside of scale range for scale deliberations
  # flag which numbers are valid scale points (0 to 8 in steps of 0.5)
rem1 <- lapply(dat$nums1, function(x) x %in% seq(0, 8, by = .5))
rem2 <- lapply(dat$nums2, function(x) x %in% seq(0, 8, by = .5))

  # remove numbers not in range for only scale deliberations:
  # the scale round is round 2 under order "A" and round 1 under order "B".
  # Numbers and their locations are subset with the same index so the two
  # stay aligned. seq_len() keeps the loop from running on an empty frame.
for (i in seq_len(nrow(dat))) {
  if (dat$order[i] == "A") {
    keep <- which(rem2[[i]])
    dat$nums2[i] <- list(dat$nums2[[i]][keep])
    dat$nums2_loc[i] <- list(dat$nums2_loc[[i]][keep])
  }
  if (dat$order[i] == "B") {
    keep <- which(rem1[[i]])
    dat$nums1[i] <- list(dat$nums1[[i]][keep])
    dat$nums1_loc[i] <- list(dat$nums1_loc[[i]][keep])
  }
}

# remove 100 mil & 200 mil from dollar deliberations, as these are amounts given
# as company profits and are almost always discussed in that context
  # flag the mentions that are NOT one of these two amounts
rem1 <- lapply(dat$nums1, function(x) !(x %in% c(100000000, 200000000)))
rem2 <- lapply(dat$nums2, function(x) !(x %in% c(100000000, 200000000)))

# remove 100 and 200mil and transform numbers that likely have different magnitudes
# The two order branches did the same work on different columns, so the
# column names are chosen once per row. The per-row scratch data frame (and
# the rm() that warned whenever it did not exist yet) is no longer needed.
for (i in seq_len(nrow(dat))) {
  # juries with any missing dollar preference are left untouched
  if (str_detect(dat$doll_prefs[i], "NA")) {
    next
  }
  # the dollar round is round 1 under order "A" and round 2 under order "B"
  if (dat$order[i] == "A") {
    num_col <- "nums1"
    loc_col <- "nums1_loc"
    keep <- which(rem1[[i]])
  } else if (dat$order[i] == "B") {
    num_col <- "nums2"
    loc_col <- "nums2_loc"
    keep <- which(rem2[[i]])
  } else {
    next
  }

  # only drop the profit figures when no juror on this jury actually
  # preferred an amount that large
  max_pref <- max(as.numeric(unlist(str_split(dat$doll_prefs[i], " "))))
  if (max_pref < 100000000) {
    dat[[num_col]][[i]] <- list(dat[[num_col]][[i]][keep])
    dat[[loc_col]][[i]] <- list(dat[[loc_col]][[i]][keep])
  }

  # a mention x is read as x thousand (or, failing that, x million) when that
  # larger amount is itself mentioned in the same deliberation
  small <- unlist(dat[[num_col]][[i]])
  thou <- small * 1000
  mill <- small * 1000000
  rescaled <- ifelse(
    thou %in% small, thou,
    ifelse(mill %in% small, mill, small)
  )
  dat[[num_col]][[i]] <- list(rescaled)
}

save(dat, file = here("Data", "jury_with_numbers.RData"))
################################
# # create juror-level data ####

# load jury-level data (the file written just above) and keep the first row
# for each jurynum
load(here("Data", "jury_with_numbers.RData"))
dat <- distinct(dat, jurynum, .keep_all = TRUE)

# load full survey/admin data,
# keeping only 6-person juries and relevant variables
juror <- here("Data", "JuryDataSummer2004.csv") %>%
  read.csv() %>%
  filter(num_js == 6) %>%
  select(case_id, jurynum, white, iscale, idoll)

# record mentions by scale
# Order "A": round 1 = dollars, round 2 = scale. Order "B": the reverse.
order_a <- dat$order == "A"
order_b <- dat$order == "B"

dat$scale_mentions[order_a] <- dat$nums2[order_a]
dat$scale_mentions[order_b] <- dat$nums1[order_b]

dat$doll_mentions[order_a] <- dat$nums1[order_a]
dat$doll_mentions[order_b] <- dat$nums2[order_b]

# same mapping for the locations of those mentions
dat$scale_mentions_loc[order_a] <- dat$nums2_loc[order_a]
dat$scale_mentions_loc[order_b] <- dat$nums1_loc[order_b]

dat$doll_mentions_loc[order_a] <- dat$nums1_loc[order_a]
dat$doll_mentions_loc[order_b] <- dat$nums2_loc[order_b]


# remove mentions not in prefs; remove 200k, 100m, and 200m
# The keep index is computed once from the unfiltered mentions and applied
# to both the mentions and their locations. Computing it from the already
# filtered mentions would select the first k locations, not the locations
# that belong to the kept mentions.
doll_cut <- c(200000, 100000000, 200000000)
for (i in seq_len(nrow(dat))) {
  # dollar mentions: keep those equal to some juror's stated preference,
  # except the three excluded amounts
  doll_vals <- unlist(dat$doll_mentions[[i]])
  doll_locs <- unlist(dat$doll_mentions_loc[[i]])
  doll_keep <- unlist(str_split(dat$doll_prefs[i], " "))
  keep_doll <- which(doll_vals %in% doll_keep & !(doll_vals %in% doll_cut))
  dat$doll_mentions[[i]] <- list(doll_vals[keep_doll])
  dat$doll_mentions_loc[[i]] <- list(doll_locs[keep_doll])

  # scale mentions: keep those equal to some juror's stated preference
  scale_vals <- unlist(dat$scale_mentions[[i]])
  scale_locs <- unlist(dat$scale_mentions_loc[[i]])
  scale_keep <- unlist(str_split(dat$scale_prefs[i], " "))
  keep_scale <- which(scale_vals %in% scale_keep)
  dat$scale_mentions[[i]] <- list(scale_vals[keep_scale])
  dat$scale_mentions_loc[[i]] <- list(scale_locs[keep_scale])
}

# count length of delib (number of characters in the round's text)
in_order_a <- dat$order == "A"
in_order_b <- dat$order == "B"

dat$scale_length[in_order_a] <- str_count(dat$text2[in_order_a])
dat$scale_length[in_order_b] <- str_count(dat$text1[in_order_b])

dat$doll_length[in_order_a] <- str_count(dat$text1[in_order_a])
dat$doll_length[in_order_b] <- str_count(dat$text2[in_order_b])

# count total mentions
n_mentions <- function(m) length(unlist(m))
dat$total_scale_mentions <- unlist(lapply(dat$scale_mentions, n_mentions))
dat$total_doll_mentions <- unlist(lapply(dat$doll_mentions, n_mentions))

# set total mentions to NA (not 0) where the deliberation length is missing
dat$total_scale_mentions[is.na(dat$scale_length)] <- NA
dat$total_doll_mentions[is.na(dat$doll_length)] <- NA

# merge survey/admin and preference mention data:
# every juror row receives its jury's verdicts, mentions and lengths; then
# keep juries whose white/nonwhite counts sum to six and one row per juror
juror <- juror %>%
  left_join(
    select(
      dat,
      jurynum, whites, nonwhites, jrydoll,
      jryscale, order, scenario,
      scale_mentions, doll_mentions,
      scale_length, doll_length,
      scale_mentions_loc, doll_mentions_loc,
      total_scale_mentions, total_doll_mentions
    ),
    by = "jurynum"
  ) %>%
  filter(whites + nonwhites == 6) %>%
  distinct(case_id, .keep_all = TRUE)

# recode dollar prefs/verdict to ordinal (0-8)
# Both recodes use the same bins. The %in% tests only match whole-dollar
# amounts, and `200001:500000 - 1` is (200001:500000) - 1 = 200000..499999,
# with 200000 already claimed by the preceding bin.
juror <- mutate(
  juror,
  or_idoll = case_when(
    idoll == 0 ~ 0,
    idoll %in% (1:50000) ~ 1,
    idoll %in% (50001:100000) ~ 2,
    idoll %in% (100001:200000) ~ 3,
    idoll %in% (200001:500000 - 1) ~ 4,
    idoll %in% (500000:750000) ~ 5,
    idoll %in% (750001:1000000) ~ 6,
    idoll %in% (1000001:3000000) ~ 7,
    idoll > 3000000 ~ 8,
    TRUE ~ NA_real_
  ),
  or_jrydoll = case_when(
    jrydoll == 0 ~ 0,
    jrydoll %in% (1:50000) ~ 1,
    jrydoll %in% (50001:100000) ~ 2,
    jrydoll %in% (100001:200000) ~ 3,
    jrydoll %in% (200001:500000 - 1) ~ 4,
    jrydoll %in% (500000:750000) ~ 5,
    jrydoll %in% (750001:1000000) ~ 6,
    jrydoll %in% (1000001:3000000) ~ 7,
    jrydoll > 3000000 ~ 8,
    TRUE ~ NA_real_
  )
)


# count mentions of each juror's preference AND
# calculate other jurors' mean preferences
# One pass over juror rows; each iteration fills that row's mention counts
# and the mean preference of the other jurors on the same jury.

for(i in 1:nrow(juror)){
  # note jury mentions
  # Scale: the jury's scale mentions are coerced to text and collapsed into
  # one string, and str_count() counts matches of the juror's own scale value
  # (used as a pattern) anywhere in that string.
  # NOTE(review): this is substring matching with no delimiters around the
  # pattern -- it presumably relies on scale values being single characters;
  # confirm no multi-character values (e.g. "4.5") can reach this point.
  juror$num_scale_mentions[i] <- str_count(paste0(as.character(juror$scale_mentions[[i]]), collapse=" "), as.character(juror$iscale[i]))
  # Dollars: every mention is padded with a space on both sides and the
  # pattern is " <idoll as integer> ", so only whole-number matches count.
  # NOTE(review): this compares text renderings of numbers on both sides;
  # confirm both sides render large amounts identically (fixed vs scientific).
  juror$num_doll_mentions[i] <- str_count(paste0(" ", as.character(unlist(juror$doll_mentions[[i]])), " ", collapse=" "), paste(" ", as.character(as.integer(juror$idoll[i])), " ", sep=""))
  
  # record mentions as NA if jury doesn't mention any
  # (i.e. the jury's list of retained mentions is empty)
  if(length(unlist(juror$scale_mentions[i]))==0){
    juror$num_scale_mentions[i] <- NA
  }
  if(length(unlist(juror$doll_mentions[i]))==0){
    juror$num_doll_mentions[i] <- NA
  }
  
  # find mean of other preferences
  # "others" = same jurynum, different case_id; mean() is called without
  # na.rm, so a missing preference among the others makes the mean NA
  juror$others_mean_scale[i] <- mean(juror$iscale[juror$jurynum==juror$jurynum[i]&juror$case_id!=juror$case_id[i]])
  juror$others_mean_doll[i] <- mean(juror$idoll[juror$jurynum==juror$jurynum[i]&juror$case_id!=juror$case_id[i]])
  juror$others_mean_ordoll[i] <- mean(juror$or_idoll[juror$jurynum==juror$jurynum[i]&juror$case_id!=juror$case_id[i]])
}

# record mentions as missing if missing preference
juror$num_doll_mentions[is.na(juror$idoll)] <- NA
juror$num_scale_mentions[is.na(juror$iscale)] <- NA


# set mentions to NA (rather than 0) for 200k, 100m, 200m jurors
# (these three amounts are excluded from the dollar mentions, so a count for
# jurors who prefer them would not be meaningful)
juror$num_doll_mentions[juror$idoll %in% c(200000, 100000000, 200000000)] <- NA

# identify dissenters (own preference more than 1.5 points away from the mean
# of the other jurors) and note distance from verdict
juror <- juror %>%
  mutate(
    dissenter_scale = abs(others_mean_scale - iscale) > 1.5,
    dissenter_doll = abs(others_mean_ordoll - or_idoll) > 1.5,
    dist_doll = abs(jrydoll - idoll),
    dist_ordoll = abs(or_jrydoll - or_idoll),
    dist_scale = abs(jryscale - iscale)
  )

# note number of dissenters by jury, recorded as the number of OTHER
# dissenters on the jury (the jury total minus the juror's own flag);
# keep only juries that still have six juror rows
juror <- juror %>%
  group_by(jurynum) %>%
  mutate(
    n_js = n(),
    n_scale_dissenters = sum(dissenter_scale) - dissenter_scale,
    n_doll_dissenters = sum(dissenter_doll) - dissenter_doll
  ) %>%
  filter(n_js == 6)

# record side of dissent: "higher"/"lower" than the others' mean for
# dissenters, NA for everyone else
juror <- juror %>%
  mutate(
    side_scale = case_when(
      dissenter_scale == TRUE & iscale > others_mean_scale ~ "higher",
      dissenter_scale == TRUE & iscale < others_mean_scale ~ "lower"
    ),
    side_doll = case_when(
      dissenter_doll == TRUE & or_idoll > others_mean_ordoll ~ "higher",
      dissenter_doll == TRUE & or_idoll < others_mean_ordoll ~ "lower"
    )
  )

# record number of allies (dissenter, same side)
# For each dissenting juror: count the rows on the same jury that are flagged
# as dissenters on the same side, minus one for the juror themself.
# Non-dissenters (and jurors whose dissenter flag is NA) get NA.
for(i in 1:nrow(juror)){
  # the !is.na() term keeps an NA dissenter flag from reaching if()
  if(juror$dissenter_scale[i]&!is.na(juror$dissenter_scale[i])){
    # NOTE(review): side_scale / dissenter_scale can be NA for other rows, so
    # the row index may contain NA -- confirm how such rows are counted by nrow()
    juror$allies_scale[i] <- nrow(juror[juror$jurynum==juror$jurynum[i]&juror$side_scale==juror$side_scale[i]&juror$dissenter_scale==TRUE,])-1
  }
  else{juror$allies_scale[i] <- NA}
  # same logic for the dollar measure
  if(juror$dissenter_doll[i]&!is.na(juror$dissenter_doll[i])){
    juror$allies_doll[i] <- nrow(juror[juror$jurynum==juror$jurynum[i]&juror$side_doll==juror$side_doll[i]&juror$dissenter_doll==TRUE,])-1
  }
  else{juror$allies_doll[i] <- NA}
  
}

# check whether each juror's preference is unique / outside the range of the
# others', and count the number of others with the same preference
for (i in 1:nrow(juror)) {
  on_same_jury <- juror$jurynum == juror$jurynum[i]
  is_other <- on_same_jury & juror$case_id != juror$case_id[i]

  # scale preference vs. the other jurors on the jury
  own_scale <- juror$iscale[i]
  other_scale <- juror$iscale[is_other]
  juror$outside_range_scale[i] <- own_scale > max(other_scale) | own_scale < min(other_scale)
  juror$unique_scale[i] <- !(own_scale %in% other_scale)

  # dollar preference vs. the other jurors on the jury
  own_doll <- juror$idoll[i]
  other_doll <- juror$idoll[is_other]
  juror$outside_range_doll[i] <- own_doll > max(other_doll) | own_doll < min(other_doll)
  juror$unique_doll[i] <- !(own_doll %in% other_doll)

  # number of others with the same preference: matches within the whole jury
  # (which include the juror's own row) minus one
  juror$num_others_doll[i] <- sum(juror$idoll[on_same_jury] == own_doll) - 1
  juror$num_others_scale[i] <- sum(juror$iscale[on_same_jury] == own_scale) - 1
}

# rerun total mentions after cleaning steps (this refreshes the columns on
# `dat`); totals are NA where the deliberation length is missing
for (kind in c("scale", "doll")) {
  mention_counts <- unlist(lapply(
    dat[[paste0(kind, "_mentions")]],
    function(m) length(unlist(m))
  ))
  mention_counts[is.na(dat[[paste0(kind, "_length")]])] <- NA
  dat[[paste0("total_", kind, "_mentions")]] <- mention_counts
}

# add preference mention timing: when are doll/scale prefs first/last mentioned?
# *_words = position of the mention divided by the length of the deliberation
# *_prefs = rank of the mention among all retained mentions divided by their total

juror <- juror %>%
  mutate(
    first_doll_words = NA,
    last_doll_words = NA,
    first_doll_prefs = NA,
    last_doll_prefs = NA,
    first_scale_words = NA,
    last_scale_words = NA,
    first_scale_prefs = NA,
    last_scale_prefs = NA
  )

for (i in 1:nrow(juror)) {
  # dollars: only for jurors whose preference was mentioned at least once
  if (isTRUE(juror$num_doll_mentions[i] > 0)) {
    own_doll <- juror$idoll[i]
    is_own_doll <- unlist(juror$doll_mentions[[i]]) == own_doll
    own_doll_locs <- unlist(juror$doll_mentions_loc[i])[is_own_doll]
    own_doll_ranks <- which(is_own_doll)
    juror$first_doll_words[i] <- own_doll_locs[1] / juror$doll_length[i]
    juror$last_doll_words[i] <- own_doll_locs[length(own_doll_locs)] / juror$doll_length[i]
    juror$first_doll_prefs[i] <- own_doll_ranks[1] / juror$total_doll_mentions[i]
    juror$last_doll_prefs[i] <- own_doll_ranks[length(own_doll_ranks)] / juror$total_doll_mentions[i]
  }
  # scale: same computation on the scale mentions
  if (isTRUE(juror$num_scale_mentions[i] > 0)) {
    own_scale <- juror$iscale[i]
    is_own_scale <- unlist(juror$scale_mentions[[i]]) == own_scale
    own_scale_locs <- unlist(juror$scale_mentions_loc[i])[is_own_scale]
    own_scale_ranks <- which(is_own_scale)
    juror$first_scale_words[i] <- own_scale_locs[1] / juror$scale_length[i]
    juror$last_scale_words[i] <- own_scale_locs[length(own_scale_locs)] / juror$scale_length[i]
    juror$first_scale_prefs[i] <- own_scale_ranks[1] / juror$total_scale_mentions[i]
    juror$last_scale_prefs[i] <- own_scale_ranks[length(own_scale_ranks)] / juror$total_scale_mentions[i]
  }
}



# merge in id info from RA codes
ra_dat <- read.csv(here("Data", "jurorspeech.csv"))

# recode un-identified jurors to blank Identifier variable
# incl. duplicate labels within jury
# duplabs holds one row per (jury, label) pair that occurs more than once
duplabs <- ra_dat %>%
  filter(!is.na(Identifier)) %>%
  count(jurynum, Identifier, name = "n_dup") %>%
  filter(n_dup > 1) %>%
  select(jurynum, Identifier)
# seq_len() makes the loop a no-op when there are no duplicated labels;
# 1:nrow() would iterate over c(1, 0) and index 0 yields a zero-length
# condition inside case_when()
for (i in seq_len(nrow(duplabs))) {
  ra_dat %<>%
    mutate(
      Identifier = case_when(
        jurynum == duplabs$jurynum[i] &
          Identifier == duplabs$Identifier[i] ~ NA_character_,
        TRUE ~ Identifier
      )
    )
}

# clean transcript numbers: strip all letters and colons, then all spaces
ra_dat$transcript_id_numbers <- str_remove_all(ra_dat$transcript_id, "[qwertyuioplkjhgfdsazxcvbnmQWERTYUIOPLKJHGFDSAZXCVBNM:]")
ra_dat$transcript_id_numbers <- str_remove_all(ra_dat$transcript_id_numbers, " ")
ra_dat <- ra_dat %>%
  select(jurynum, transcript_id_numbers, transcript_id, case_id, Identifier)

# merge data: transcript ids are attached per jury, identifiers per juror
# (both are natural joins on the columns the two tables share)
juror <- left_join(
  juror,
  ra_dat %>%
    select(jurynum, transcript_id, transcript_id_numbers) %>%
    distinct()
)
juror <- left_join(juror, ra_dat %>% select(case_id, Identifier))

save(juror, file = here("Data", "juror_level.RData"))

